#load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# loading the data: one CSV of gyroscope readings per participant,
# tagged with an ID column so the rows can be told apart after rbind()
data1 <- read.csv("Ali.csv")
data1$ID <- "Ali"
data2 <- read.csv("Nicole.csv")
data2$ID <- "Nicole"
data3 <- read.csv("Melika.csv")
data3$ID <- "Melika"
# BUG FIX: `header = -1` is not a valid logical value for read.csv(); any
# non-zero number is silently treated as TRUE, so the file's first row was
# consumed as column names and `data4[-1, ]` below then discarded a real
# data row as well. The columns are renamed manually below, so the file is
# meant to be read without a header (TODO: confirm against the raw CSV).
data4 <- read.csv("DanielKwakpushup.csv", header = FALSE)
data4$ID <- "Daniel"
data5 <- read.csv("HeChenSensor(HC).csv")
data5$ID <- "HeChen"
data6 <- read.csv("jiaxin(JL).csv")
data6$ID <- "Jiaxin"
data7 <- read.csv("Malik_pushupdata.csv")
data7$ID <- "Malik"
data8 <- read.csv("jiasheng_sensor_update.csv")
data8$ID <- "Jiasheng"
colnames(data8) <- c("time", "wx", "wy", "wz", "ID")
# drop the leading header-text row and the stray 5th raw column of Daniel's
# file, then align its column names with the other participants' data sets
data4 <- data4[-1, -5]
colnames(data4) <- c("time", "wx", "wy", "wz", "ID")

Step II

#Option 1 kmeans
# Combine all participants, then compute per-subject mean absolute angular
# speed on each axis (abs() so opposite rotation directions don't cancel).
data <- rbind(data1, data2, data3, data4, data5, data6, data7, data8)
data <- select(data, -1)                   # drop the time column
data[1:3] <- sapply(data[1:3], as.numeric) # wx/wy/wz may have been read as character
data <- na.omit(data)
subject_data <- data %>%
  group_by(ID) %>%
  summarise(AverageX = mean(abs(wx)),
            AverageY = mean(abs(wy)),
            AverageZ = mean(abs(wz)))
## `summarise()` ungrouping output (override with `.groups` argument)
subject_data <- arrange(subject_data, ID)

# add counts data to the main dataset
# NOTE(review): the push-up counts are attached purely by position after
# sorting both tables alphabetically; this only works while first names (ID)
# and full names (X) happen to sort in the same order. A keyed join would be
# safer -- TODO: confirm the ordering assumption holds for these 8 subjects.
countData <- na.omit(read.csv("Questionnairre.csv"))
countData <- arrange(countData, X)
pushupCounts <- countData$Number.of.Pushups
subject_data$pushupCounts <- pushupCounts

# fit the data by using Kmeans
Kmeans_data <- select(subject_data, c(-1, -5)) # drop ID and pushupCounts
Kmeans_data <- scale(Kmeans_data)              # standardise so no axis dominates
# FIX: kmeans() starts from random centroids, so without a seed the cluster
# assignment (and the plot below) changes on every run. Seed the RNG and use
# several random starts to avoid a poor local optimum.
set.seed(123)
fit <- kmeans(Kmeans_data, 2, nstart = 25)

subject_data$cluster <- as.factor(paste("cluster", fit$cluster))

#Kmeans visual route 1: 3D scatter of the per-subject averages,
# coloured by cluster, labelled "Name ( push-up count )"
labels <- paste(subject_data$ID, "(", subject_data$pushupCounts, ")")
plot_ly(subject_data, x = ~AverageX, y = ~AverageY, z = ~AverageZ,
        type = "scatter3d", mode = "markers", color = ~cluster, text = labels)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#Option 2 kmeans
# Removing the ID and storing the data as numeric.
# (Column 4 is ID here: the time column was already dropped above, so the
# remaining columns are wx, wy, wz, ID.)
data <- data[, -4]
data$wx <- as.numeric(data$wx)
data$wy <- as.numeric(data$wy)
data$wz <- as.numeric(data$wz)

# Running the k-means on every individual sensor reading
# (unlike Option 1, which clustered the per-subject averages)
K <- scale(data)                # standardise each axis
k2 <- as.data.frame(na.omit(K))
str(k2)
## 'data.frame':    28344 obs. of  3 variables:
##  $ wx: num  -0.3231 -0.3231 -0.1867 -0.0843 -0.0161 ...
##  $ wy: num  -0.09191 -0.09191 0.11032 -0.00203 -0.22674 ...
##  $ wz: num  0.0682 0.0682 0.0449 0.0216 0.0216 ...
# FIX: seed the RNG (and use multiple starts) so the 3-cluster solution is
# reproducible instead of changing on every knit.
set.seed(123)
fit <- kmeans(k2, 3, nstart = 25)
#fit$cluster

# K means visual route 2
# (factoextra is already attached at the top of the file)
fviz_cluster(fit, data = k2)

Using K-means, can you identify the novices from the experts using only the app data?

ANSWER: Yes. According to the graph, you can see clear clusters separating those that are doing well (experts) from those that are not (novices). When we compare the clusters with the number of push-ups each person did, the clusters correspond to the counts; thus k-means works for our activity in distinguishing experts from novices in our group.

Step III

# loading the data: self-assessment questionnaire, one row per participant
D2 <- read.csv("Questionnairre.csv")
D2 <- as.data.frame(na.omit(D2))  # drops the many blank trailing rows (see na.action below)
str(D2)
## 'data.frame':    8 obs. of  7 variables:
##  $ X                                                             : chr  "Nicole Schlosberg" "He Chen" "Melika Ghayoomi" "Malik Muftau" ...
##  $ How.difficult.was.the.task.                                   : int  5 3 1 2 1 1 3 5
##  $ How.tired.are.you.after.30.seconds.                           : int  3 3 1 1 1 1 2 2
##  $ How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice.: int  1 2 5 5 4 5 3 1
##  $ How.stable.was.your.posture.                                  : int  2 3 4 5 4 5 4 4
##  $ How.low.to.the.floor.did.you.go.                              : int  1 2 4 5 4 4 3 1
##  $ Number.of.Pushups                                             : int  15 24 15 20 22 20 19 22
##  - attr(*, "na.action")= 'omit' Named int [1:988] 9 10 11 12 13 14 15 16 17 18 ...
##   ..- attr(*, "names")= chr [1:988] "9" "10" "11" "12" ...
# PCA on the five Likert-scale questions only: column 1 (name) and column 7
# (push-up count) are excluded; scale. = TRUE standardises each question so
# they contribute on an equal footing.
pca <- prcomp(D2[,c(-1,-7)], scale. = TRUE)
summary(pca)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5
## Standard deviation     2.0827 0.66215 0.34397 0.28738 0.15131
## Proportion of Variance 0.8676 0.08769 0.02366 0.01652 0.00458
## Cumulative Proportion  0.8676 0.95524 0.97890 0.99542 1.00000
plot(pca, type = "lines")  # scree plot: variance explained by each component

#questions breakdown with PCs
# Absolute loadings: sign is discarded, keeping only the magnitude of each
# question's contribution to each principal component.
loadings <- abs(pca$rotation) 
loadings
##                                                                      PC1
## How.difficult.was.the.task.                                    0.4396477
## How.tired.are.you.after.30.seconds.                            0.4503110
## How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice. 0.4690327
## How.stable.was.your.posture.                                   0.4111428
## How.low.to.the.floor.did.you.go.                               0.4635729
##                                                                      PC2
## How.difficult.was.the.task.                                    0.5226301
## How.tired.are.you.after.30.seconds.                            0.2973569
## How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice. 0.2370147
## How.stable.was.your.posture.                                   0.7349480
## How.low.to.the.floor.did.you.go.                               0.2052123
##                                                                      PC3
## How.difficult.was.the.task.                                    0.1038287
## How.tired.are.you.after.30.seconds.                            0.8111485
## How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice. 0.1425751
## How.stable.was.your.posture.                                   0.4068149
## How.low.to.the.floor.did.you.go.                               0.3813550
##                                                                      PC4
## How.difficult.was.the.task.                                    0.6916179
## How.tired.are.you.after.30.seconds.                            0.2090643
## How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice. 0.2081644
## How.stable.was.your.posture.                                   0.3539795
## How.low.to.the.floor.did.you.go.                               0.5561681
##                                                                        PC5
## How.difficult.was.the.task.                                    0.210836253
## How.tired.are.you.after.30.seconds.                            0.084433279
## How.do.you.rate.yourself..5.being.an.expert..1.being.a.novice. 0.812509876
## How.stable.was.your.posture.                                   0.003645096
## How.low.to.the.floor.did.you.go.                               0.536873831

What does PC1 represent in your analysis?

ANSWER: PC1 represents all 5 of our questions with relatively equal weights (43.96%, 45.03%, 46.90%, 41.11%, 46.36%). PC1 alone accounts for a relatively high 86.76% of the total variance (its proportion of variance).

# PC1 visual: individuals projected onto the first two PCs,
# coloured by cos2 (how well each point is represented in this plane)
fviz_pca_ind(pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

# PCA visual: variable (question) arrows in PC space,
# coloured by their contribution to the components
fviz_pca_var(pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

Step IV

If you could only choose one thing to predict the score for each member of your group (raw data variable, a cluster or a PC) what would it be and why?

ANSWER: PC1 would be the best option for predicting the score of each member of the group, because it is more representative than any single raw variable: it combines all five questionnaire responses with roughly equal weights and captures 86.76% of the total variance on its own.

#Visualizations for answer
#Total variation explained by PCA (scree plot of eigenvalues per component)
library(factoextra)  # already attached at the top of the file; harmless re-load
fviz_eig(pca)

Based on your conclusion devise an intervention that would help novice members of your group improve their performance

ANSWER: Intervention 1 - Novice members could repeat the activity daily for improved results. Intervention 2 - Novice members can do fast push-ups at the beginning to compensate for the energy lost toward the end, which otherwise results in fewer or no push-ups.